Label = ob

Dados

[ ]:
# Load the labeled cancer datasets (presumably pandas.read_csv imported elsewhere — TODO confirm).
# Printed shapes below: SP (806402, 94), outside-SP (62317, 94).
df_SP = read_csv('/content/drive/MyDrive/Trabalho/Cancer/Datasets/cancer_SP_labels.csv')
df_fora = read_csv('/content/drive/MyDrive/Trabalho/Cancer/Datasets/cancer_foraSP_labels.csv')
(806402, 94)
(62317, 94)

SP

[ ]:
# Columns with the most missing values in the SP dataset (recurrence dates dominate).
df_SP.isna().sum().sort_values(ascending=False).head(6)
DTRECIDIVA    717377
delta_t6      717377
delta_t5      717377
delta_t4      717377
IDADE              1
TRATAMENTO         0
dtype: int64
[ ]:
# Top-20 features by absolute correlation with the target 'ob' (death indicator) in SP.
corr_matrix = df_SP.corr()
abs(corr_matrix['ob']).sort_values(ascending = False).head(20)
ob              1.000000
vivo_sem_rec    0.946549
ob_sem_rec      0.870247
ULTINFO         0.868236
ano_ob          0.750408
PERDASEG        0.378763
ob_com_rec      0.319736
ANODIAG         0.269601
CIRURGIA        0.249248
QUIMIO          0.244519
delta_t7        0.216269
delta_t8        0.216105
delta_t9        0.212633
CATEATEND       0.212081
RECNENHUM       0.196724
delta_t5        0.169360
delta_t4        0.167767
delta_t6        0.160751
RECREGIO        0.141924
IDADE           0.136383
Name: ob, dtype: float64
[ ]:
# Class balance of the 'ob' label in SP.
df_SP.ob.value_counts()
0    480724
1    325678
Name: ob, dtype: int64
[ ]:
# Sanity check: patients lost to follow-up (PERDASEG == 1) are almost all labeled alive (ob == 0).
df_SP['ob'][df_SP.PERDASEG == 1].value_counts()
0    140925
1         1
Name: ob, dtype: int64

Fora de SP

[ ]:
# Columns with the most missing values in the outside-SP dataset.
df_fora.isna().sum().sort_values(ascending=False).head(6)
DTRECIDIVA    57799
delta_t6      57799
delta_t5      57799
delta_t4      57799
NAOTRAT           0
LOCALTNM          0
dtype: int64
[ ]:
# Top-20 features by absolute correlation with 'ob' outside SP.
corr_matrix = df_fora.corr()
abs(corr_matrix['ob']).sort_values(ascending = False).head(20)
ob              1.000000
vivo_sem_rec    0.949190
ob_sem_rec      0.928200
ULTINFO         0.854305
ano_ob          0.772548
PERDASEG        0.359090
ob_com_rec      0.275720
CIRURGIA        0.257715
QUIMIO          0.253681
ANODIAG         0.252079
CATEATEND       0.209850
delta_t5        0.168003
delta_t8        0.167875
delta_t4        0.166380
delta_t7        0.165922
delta_t9        0.164855
delta_t6        0.163108
RECNENHUM       0.149262
RADIO           0.139862
GLEASON         0.104262
Name: ob, dtype: float64
[ ]:
# Class balance of the 'ob' label outside SP.
df_fora.ob.value_counts()
0    44591
1    17726
Name: ob, dtype: int64
[ ]:
# Same follow-up-loss check outside SP: all PERDASEG == 1 patients have ob == 0.
df_fora['ob'][df_fora.PERDASEG == 1].value_counts()
0    15263
Name: ob, dtype: int64

Divisão em treino e teste

[ ]:
# Columns dropped before training: dates, follow-up/outcome columns that leak the
# label (ano_ob, ob_*_rec, vivo_*_rec, ULTINFO correlate ~0.75-0.95 with 'ob' above),
# and the post-diagnosis delta_t* intervals.
list_drop = ['UFRESID', 'DTCONSULT', 'DTDIAG', 'DTTRAT', 'DTRECIDIVA', 'DTULTINFO',
              'IDADE', 'PERDASEG', 'CONSDIAG', 'TRATCONS', 'DIAGTRAT', 'delta_t4',
              'delta_t5', 'delta_t6', 'delta_t7', 'delta_t8', 'delta_t9', 'ano_ob',
              'ob_com_rec', 'ob_sem_rec', 'vivo_com_rec', 'vivo_sem_rec', 'ULTINFO']

# Target column for this section.
lb = 'ob'

SP

[ ]:
# Train/test split for SP (project helper; drops list_drop and separates the 'ob' label).
X_trainSP, X_testSP, y_trainSP, y_testSP = get_train_test(df_SP, list_drop, lb)
X_train = (604801, 70), X_test = (201601, 70)
y_train = (604801,), y_test = (201601,)

Fora de SP

[ ]:
# Train/test split for the outside-SP data.
X_trainFora, X_testFora, y_trainFora, y_testFora = get_train_test(df_fora, list_drop, lb)
X_train = (46737, 70), X_test = (15580, 70)
y_train = (46737,), y_test = (15580,)

Encoder e normalização

SP

[ ]:
# Fit encoders + StandardScaler on SP training data; keeps them to reuse on the test set.
X_trainSP_enc, enc_SP, norm_SP = train_preprocessing(X_trainSP, normalizer='StandardScaler')

Fora de SP

[ ]:
# Fit encoders + StandardScaler on the outside-SP training data.
X_trainFora_enc, enc_fora, norm_fora = train_preprocessing(X_trainFora, normalizer='StandardScaler')

PCA

SP

[ ]:
# Full PCA (all components) on the encoded SP training features, for the scree plot below.
pca = PCA()
principalComponents = pca.fit_transform(X_trainSP_enc)
[ ]:
# Scree plot: cumulative explained variance (step line) plus per-component
# explained variance (bars).
fig = go.Figure()

# One x position per principal component. Derived from the fitted PCA instead
# of the hard-coded np.linspace(1, 70, 70), so the plot stays correct if the
# number of encoded features changes.
n_components = len(pca.explained_variance_ratio_)
components = np.arange(1, n_components + 1)

fig.add_trace(
    go.Scatter(
        x=components,
        y=np.cumsum(pca.explained_variance_ratio_),
        line_shape='hv',
    ))

fig.add_trace(
    go.Bar(
        x=components,
        y=pca.explained_variance_ratio_
    ))
fig.update_layout(yaxis_title='Variância que pode ser "explicada"', xaxis_title='Número de componentes principais')

fig.show()

Fora de SP

[ ]:
# Full PCA on the encoded outside-SP training features.
pca = PCA()
principalComponents = pca.fit_transform(X_trainFora_enc)
[ ]:
# Scree plot for the outside-SP PCA: cumulative (line) and per-component (bars)
# explained variance.
fig = go.Figure()

# Derive the component count from the fitted PCA rather than hard-coding 70.
n_components = len(pca.explained_variance_ratio_)
components = np.arange(1, n_components + 1)

fig.add_trace(
    go.Scatter(
        x=components,
        y=np.cumsum(pca.explained_variance_ratio_),
        line_shape='hv',
    ))

fig.add_trace(
    go.Bar(
        x=components,
        y=pca.explained_variance_ratio_
    ))
fig.update_layout(yaxis_title='Variância que pode ser "explicada"', xaxis_title='Número de componentes principais')

fig.show()

Balanceamento dos dados

SP

[ ]:
# Count of each label value in the training data.
y_trainSP.value_counts()
0    360572
1    244229
Name: ob, dtype: int64
[ ]:
# Balance the SP training set by randomly undersampling the majority class (0).
rus = RandomUnderSampler(random_state=seed)
# `fit_sample` was deprecated and then removed in imbalanced-learn (gone as of
# 0.6); `fit_resample` is the supported API and matches the SMOTE cells used
# elsewhere in this notebook.
X_SP, y_SP = rus.fit_resample(X_trainSP_enc, y_trainSP)
[ ]:
# Resampled training-set size (2 x minority class count).
y_SP.shape
(488458,)

Fora de SP

[ ]:
# Count of each label value in the training data.
y_trainFora.value_counts()
0    33365
1    13372
Name: ob, dtype: int64
[ ]:
# Outside-SP is smaller, so oversample the minority class with SMOTE instead of undersampling.
X_fora, y_fora = SMOTE(random_state=seed).fit_resample(X_trainFora_enc, y_trainFora)
[ ]:
# Resampled training-set size (2 x majority class count).
y_fora.shape
(66730,)

Treinamento dos modelos de Machine Learning

Random Forest

[ ]:
# SP: Random Forest with default hyperparameters on the undersampled (balanced) data.
rf_sp = RandomForestClassifier(random_state=seed)
rf_sp.fit(X_SP, y_SP)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=10, verbose=0,
                       warm_start=False)
[ ]:
# Fora: Random Forest biased heavily toward class 0 via class_weight.
# NOTE(review): the data was already balanced 1:1 by SMOTE above, so a 10 vs
# 0.0001 weight ratio skews predictions strongly toward class 0 — presumably a
# deliberate precision/recall trade-off; confirm against the confusion matrices below.
rf_fora = RandomForestClassifier(random_state=seed, class_weight={0: 10, 1: 0.0001})
rf_fora.fit(X_fora, y_fora)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0: 10, 1: 0.0001}, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=10, verbose=0, warm_start=False)

XGBoost

[ ]:
# SP: XGBoost with deeper trees (max_depth=15) than the library default.
xgboost_sp = xgb.XGBClassifier(max_depth=15, random_state=seed)

xgboost_sp.fit(X_SP, y_SP)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=10,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
[ ]:
# Fora de SP: XGBoost; scale_pos_weight=10 up-weights the positive class even
# though SMOTE already balanced the training set — NOTE(review): confirm this
# trade-off against the confusion matrices below.
xgboost_fora = xgb.XGBClassifier(max_depth=15, scale_pos_weight=10,
                                 random_state=seed)

xgboost_fora.fit(X_fora, y_fora)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=10,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=10, seed=None,
              silent=None, subsample=1, verbosity=1)

Salvando modelos

[ ]:
#with open('/content/drive/MyDrive/Trabalho/Cancer/Modelos/models_SP.pkl', 'wb') as arq:
    #pickle.dump(
        #{'X_train': X_SP, 'y_train': y_SP, 'Encoders': enc_SP,
         #'Normalizer': norm_SP, 'Random Forest': rf_sp, 'XGB': xgboost_sp}, arq)
[ ]:
#with open('/content/drive/MyDrive/Trabalho/Cancer/Modelos/models_foraSP.pkl', 'wb') as arq:
    #pickle.dump(
        #{'X_train': X_fora, 'y_train': y_fora, 'Encoders': enc_fora,
         #'Normalizer': norm_fora, 'Random Forest': rf_fora, 'XGB': xgboost_fora}, arq)

Validação dos modelos

Pré-processamento dos dados de teste

[ ]:
# SP: apply the encoders/normalizer fitted on the training data to the test set.
X_testSP_ = test_preprocessing(X_testSP, enc_SP, norm_SP)
[ ]:
# Fora de SP: apply the fitted encoders/normalizer to the test set.
X_testFora_ = test_preprocessing(X_testFora, enc_fora, norm_fora)

Random Forest

[ ]:
# Row-normalized confusion matrix for the SP Random Forest on the held-out test set.
plot_confusion_matrix(rf_sp, X_testSP_, y_testSP, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
_images/Cancer_Classifiers_55_0.png
[ ]:
# Feature importances; note the raw X_testSP is passed, not the encoded X_testSP_.
plot_feat_importances(rf_sp, X_testSP)
_images/Cancer_Classifiers_56_0.png
[ ]:
# Row-normalized confusion matrix for the outside-SP Random Forest.
plot_confusion_matrix(rf_fora, X_testFora_, y_testFora, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
_images/Cancer_Classifiers_57_0.png
[ ]:
# Feature importances for the outside-SP Random Forest.
plot_feat_importances(rf_fora, X_testFora)
_images/Cancer_Classifiers_58_0.png

XGBoost

[ ]:
# Row-normalized confusion matrix for the SP XGBoost model.
plot_confusion_matrix(xgboost_sp, X_testSP_, y_testSP, cmap='Blues', normalize='true', values_format='.2f')
plt.show()
_images/Cancer_Classifiers_60_0.png
[ ]:
# Feature importances for the SP XGBoost model.
plot_feat_importances(xgboost_sp, X_testSP)
_images/Cancer_Classifiers_61_0.png
[ ]:
# Row-normalized confusion matrix for the outside-SP XGBoost model.
plot_confusion_matrix(xgboost_fora, X_testFora_, y_testFora, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
_images/Cancer_Classifiers_62_0.png
[ ]:
# Feature importances for the outside-SP XGBoost model.
plot_feat_importances(xgboost_fora, X_testFora)
_images/Cancer_Classifiers_63_0.png

Label = RECNENHUM

Dados

[ ]:
# Reload the raw datasets for the RECNENHUM (no-recurrence) label section.
df_SP = read_csv('/content/drive/MyDrive/Trabalho/Cancer/Datasets/cancer_SP_labels.csv')
df_fora = read_csv('/content/drive/MyDrive/Trabalho/Cancer/Datasets/cancer_foraSP_labels.csv')
(806402, 94)
(62317, 94)

SP

[ ]:
# Missing-value overview for SP (same as in the 'ob' section).
df_SP.isna().sum().sort_values(ascending=False).head(6)
DTRECIDIVA    717377
delta_t6      717377
delta_t5      717377
delta_t4      717377
IDADE              1
TRATAMENTO         0
dtype: int64
[ ]:
# Top-20 features by absolute correlation with RECNENHUM in SP.
corr_matrix = df_SP.corr()
abs(corr_matrix['RECNENHUM']).sort_values(ascending = False).head(20)
RECNENHUM       1.000000
ob_com_rec      0.829363
RECLOCAL        0.679652
RECREGIO        0.529338
vivo_com_rec    0.522025
RECDIST         0.497837
vivo_sem_rec    0.364914
ano_ob          0.242833
ob_sem_rec      0.227291
ob              0.196724
QUIMIO          0.147652
RADIO           0.085910
ULTINFO         0.081964
PERDASEG        0.070304
delta_t8        0.064934
delta_t7        0.063559
delta_t9        0.063466
IBGEATEN        0.062526
delta_t5        0.058940
delta_t4        0.055412
Name: RECNENHUM, dtype: float64

Fora de SP

[ ]:
# Missing-value overview for outside SP.
df_fora.isna().sum().sort_values(ascending=False).head(6)
DTRECIDIVA    57799
delta_t6      57799
delta_t5      57799
delta_t4      57799
NAOTRAT           0
LOCALTNM          0
dtype: int64
[ ]:
# Top-20 features by absolute correlation with RECNENHUM outside SP.
corr_matrix = df_fora.corr()
abs(corr_matrix['RECNENHUM']).sort_values(ascending = False).head(20)
RECNENHUM       1.000000
ob_com_rec      0.749663
RECLOCAL        0.695533
vivo_com_rec    0.642191
RECDIST         0.579667
RECREGIO        0.430531
vivo_sem_rec    0.349104
ano_ob          0.217344
ob              0.149262
ob_sem_rec      0.135709
QUIMIO          0.130924
delta_t8        0.104972
delta_t9        0.103801
delta_t7        0.103667
delta_t5        0.087167
delta_t4        0.082168
delta_t6        0.081259
IDADE           0.074725
RADIO           0.074704
DIAGPREV        0.062018
Name: RECNENHUM, dtype: float64

Divisão em treino e teste

[ ]:
# RECNENHUM class balance in SP (heavily skewed toward 1 = no recurrence).
df_SP.RECNENHUM.value_counts()
1    732633
0     73769
Name: RECNENHUM, dtype: int64
[ ]:
# Downsample the majority class (RECNENHUM == 1) to 400k rows, keep all
# minority rows, and rebuild the frame in original index order.
n_samples = 400000
df_SP_rec = df_SP[df_SP.RECNENHUM == 1].sample(n_samples, random_state=seed).sort_index()
df_SP_sem_rec = df_SP[df_SP.RECNENHUM == 0]
df_SP_menor = pd.concat([df_SP_rec, df_SP_sem_rec]).sort_index()
df_SP_menor.RECNENHUM.value_counts()
1    400000
0     73769
Name: RECNENHUM, dtype: int64
[ ]:
# RECNENHUM class balance outside SP.
df_fora.RECNENHUM.value_counts()
1    59137
0     3180
Name: RECNENHUM, dtype: int64
[ ]:
# Drop dates, outcome columns, and the recurrence-detail columns (RECLOCAL,
# RECREGIO, RECDIST, REC01-04) that directly leak the RECNENHUM label.
list_drop = ['UFRESID', 'DTCONSULT', 'DTDIAG', 'DTTRAT', 'DTRECIDIVA', 'DTULTINFO',
             'IDADE', 'PERDASEG', 'CONSDIAG', 'TRATCONS', 'DIAGTRAT', 'RECLOCAL',
             'RECREGIO', 'RECDIST', 'REC01', 'REC02', 'REC03', 'REC04', 'delta_t4',
             'delta_t5', 'delta_t6', 'delta_t7', 'delta_t8', 'delta_t9', 'ob', 'ano_ob',
             'ob_com_rec', 'ob_sem_rec', 'vivo_com_rec', 'vivo_sem_rec', 'ULTINFO']

# Target column for this section.
label = 'RECNENHUM'

SP

[ ]:
# Train/test split on the downsampled SP frame.
X_trainSP, X_testSP, y_trainSP, y_testSP = get_train_test(df_SP_menor, list_drop, label)
X_train = (355326, 62), X_test = (118443, 62)
y_train = (355326,), y_test = (118443,)

Fora de SP

[ ]:
# Train/test split for outside SP.
X_trainFora, X_testFora, y_trainFora, y_testFora = get_train_test(df_fora, list_drop, label)
X_train = (46737, 62), X_test = (15580, 62)
y_train = (46737,), y_test = (15580,)

Encoder e normalização

SP

[ ]:
# Fit encoders + StandardScaler on SP training data.
X_trainSP_enc, enc_SP, norm_SP = train_preprocessing(X_trainSP, normalizer='StandardScaler')

Fora de SP

[ ]:
# Fit encoders + StandardScaler on outside-SP training data.
X_trainFora_enc, enc_fora, norm_fora = train_preprocessing(X_trainFora, normalizer='StandardScaler')

PCA

SP

[ ]:
# PCA on the encoded SP training features, plus a scree plot (cumulative line
# + per-component bars of explained variance).
pca = PCA()
principalComponents = pca.fit_transform(X_trainSP_enc)

fig = go.Figure()

# BUG FIX: the x axis previously used X_trainSP_enc.shape[0] (the number of
# *rows*, hundreds of thousands), but there is one explained-variance entry
# per principal component. Derive the count from the fitted PCA so the trace
# x/y lengths match.
n_components = len(pca.explained_variance_ratio_)
components = np.arange(1, n_components + 1)

fig.add_trace(
    go.Scatter(
        x=components,
        y=np.cumsum(pca.explained_variance_ratio_),
        line_shape='hv',
    ))

fig.add_trace(
    go.Bar(
        x=components,
        y=pca.explained_variance_ratio_
    ))
fig.update_layout(yaxis_title='Variância que pode ser "explicada"', xaxis_title='Número de componentes principais')

fig.show()

Fora de SP

[ ]:
# PCA on the encoded outside-SP training features, plus a scree plot.
pca = PCA()
principalComponents = pca.fit_transform(X_trainFora_enc)

fig = go.Figure()

# BUG FIX: x previously spanned X_trainFora_enc.shape[0] (row count) while y
# has one value per principal component; use the component count instead.
n_components = len(pca.explained_variance_ratio_)
components = np.arange(1, n_components + 1)

fig.add_trace(
    go.Scatter(
        x=components,
        y=np.cumsum(pca.explained_variance_ratio_),
        line_shape='hv',
    ))

fig.add_trace(
    go.Bar(
        x=components,
        y=pca.explained_variance_ratio_
    ))
fig.update_layout(yaxis_title='Variância que pode ser "explicada"', xaxis_title='Número de componentes principais')

fig.show()

Balanceamento dos dados

SP

[ ]:
# Count of each label value in the training data.
y_trainSP.value_counts()
1    300057
0     55269
Name: RECNENHUM, dtype: int64
[ ]:
# Oversample the minority class (0) with SMOTE to a 1:1 ratio.
X_SP, y_SP = SMOTE(random_state=seed).fit_resample(X_trainSP_enc, y_trainSP)
[ ]:
# Resampled training-set size after SMOTE.
y_SP.shape
(600114,)

Fora de SP

[ ]:
# Count of each label value in the training data.
y_trainFora.value_counts()
1    44367
0     2370
Name: RECNENHUM, dtype: int64
[ ]:
# SMOTE-oversample the outside-SP training set to a 1:1 ratio.
X_fora, y_fora = SMOTE(random_state=seed).fit_resample(X_trainFora_enc, y_trainFora)
[ ]:
# Resampled training-set size after SMOTE.
y_fora.shape
(88734,)

Treinamento dos modelos de Machine Learning

Random Forest

[ ]:
# SP: Random Forest heavily weighted toward the minority class 0.
# NOTE(review): SMOTE already balanced the data 1:1; the 20 vs 6e-05 weight
# ratio pushes predictions strongly toward class 0 (recurrence) — presumably
# to trade precision for recall; verify against the confusion matrices below.
rf_sp = RandomForestClassifier(random_state=seed,
                               class_weight={0: 20, 1: 0.00006})
rf_sp.fit(X_SP, y_SP)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0: 20, 1: 6e-05}, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=10, verbose=0, warm_start=False)
[ ]:
# Fora: same approach with an even stronger class-0 weighting (24 vs 2.2e-05).
# NOTE(review): applied on top of an already SMOTE-balanced set — confirm intent.
rf_fora = RandomForestClassifier(random_state=seed,
                                 class_weight={0: 24, 1: 0.000022})
rf_fora.fit(X_fora, y_fora)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0: 24, 1: 2.2e-05}, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=10, verbose=0, warm_start=False)

XGBoost

[ ]:
# SP: XGBoost; scale_pos_weight < 1 down-weights the positive (no-recurrence)
# class, mirroring the Random Forest's bias toward class 0.
xgboost_sp = xgb.XGBClassifier(max_depth=15,
                               scale_pos_weight=0.15,
                               random_state=seed)

xgboost_sp.fit(X_SP, y_SP)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=10,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=0.15, seed=None,
              silent=None, subsample=1, verbosity=1)
[ ]:
# Fora de SP: stronger down-weighting of the positive class (values 0.05 and
# 0.01 were also tried, per the original inline note).
xgboost_fora = xgb.XGBClassifier(max_depth=15,
                                 scale_pos_weight=0.02, # 0.05 e 0.01
                                 random_state=seed)

xgboost_fora.fit(X_fora, y_fora)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=10,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=0.02, seed=None,
              silent=None, subsample=1, verbosity=1)

Validação dos modelos

Pré-processamento dos dados de teste

[ ]:
# SP: apply the fitted encoders/normalizer to the test set.
X_testSP_ = test_preprocessing(X_testSP, enc_SP, norm_SP)
[ ]:
# Fora de SP: apply the fitted encoders/normalizer to the test set.
X_testFora_ = test_preprocessing(X_testFora, enc_fora, norm_fora)

Random Forest

[ ]:
# Row-normalized confusion matrix for the SP Random Forest.
plot_confusion_matrix(rf_sp, X_testSP_, y_testSP, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
_images/Cancer_Classifiers_113_0.png
[ ]:
# Feature importances; note the raw X_testSP is passed, not the encoded X_testSP_.
plot_feat_importances(rf_sp, X_testSP)
_images/Cancer_Classifiers_114_0.png
[ ]:
# Row-normalized confusion matrix for the outside-SP Random Forest.
plot_confusion_matrix(rf_fora, X_testFora_, y_testFora, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
_images/Cancer_Classifiers_115_0.png
[ ]:
# Feature importances for the outside-SP Random Forest.
plot_feat_importances(rf_fora, X_testFora)
_images/Cancer_Classifiers_116_0.png

XGBoost

[ ]:
# Row-normalized confusion matrix for the SP XGBoost model.
plot_confusion_matrix(xgboost_sp, X_testSP_, y_testSP, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
_images/Cancer_Classifiers_118_0.png
[ ]:
# Feature importances for the SP XGBoost model.
plot_feat_importances(xgboost_sp, X_testSP)
_images/Cancer_Classifiers_119_0.png
[ ]:
# Row-normalized confusion matrix for the outside-SP XGBoost model.
plot_confusion_matrix(xgboost_fora, X_testFora_, y_testFora, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
_images/Cancer_Classifiers_120_0.png
[ ]:
# Feature importances for the outside-SP XGBoost model.
plot_feat_importances(xgboost_fora, X_testFora)
_images/Cancer_Classifiers_121_0.png

Label = RECDIST

Dados

[ ]:
# Reload the raw datasets for the RECDIST (distant-recurrence) label section.
df_SP = read_csv('/content/drive/MyDrive/Trabalho/Cancer/Datasets/cancer_SP_labels.csv')
df_fora = read_csv('/content/drive/MyDrive/Trabalho/Cancer/Datasets/cancer_foraSP_labels.csv')
(806402, 94)
(62317, 94)

SP

[ ]:
# Missing-value overview for SP.
df_SP.isna().sum().sort_values(ascending=False).head(6)
DTRECIDIVA    717377
delta_t6      717377
delta_t5      717377
delta_t4      717377
IDADE              1
TRATAMENTO         0
dtype: int64
[ ]:
# Top-20 features by absolute correlation with RECDIST in SP.
corr_matrix = df_SP.corr()
abs(corr_matrix['RECDIST']).sort_values(ascending = False).head(20)
RECDIST         1.000000
RECNENHUM       0.497837
ob_com_rec      0.427203
vivo_com_rec    0.238029
vivo_sem_rec    0.181668
ano_ob          0.119045
ob_sem_rec      0.113154
QUIMIO          0.109437
ob              0.105117
CATEATEND       0.093630
ANODIAG         0.084918
LATERALI        0.081624
RECLOCAL        0.058266
PERDASEG        0.055281
HORMONIO        0.045904
RRAS            0.043732
IBGEATEN        0.043044
RADIO           0.041206
delta_t5        0.040047
RADIOAPOS       0.038809
Name: RECDIST, dtype: float64

Fora de SP

[ ]:
# Missing-value overview for outside SP.
df_fora.isna().sum().sort_values(ascending=False).head(6)
DTRECIDIVA    57799
delta_t6      57799
delta_t5      57799
delta_t4      57799
NAOTRAT           0
LOCALTNM          0
dtype: int64
[ ]:
# Top-20 features by absolute correlation with RECDIST outside SP.
corr_matrix = df_fora.corr()
abs(corr_matrix['RECDIST']).sort_values(ascending = False).head(20)
RECDIST         1.000000
RECNENHUM       0.579667
ob_com_rec      0.459205
vivo_com_rec    0.343707
vivo_sem_rec    0.202364
ano_ob          0.120521
QUIMIO          0.107406
ob              0.095742
CATEATEND       0.087712
ob_sem_rec      0.078666
LATERALI        0.073880
DIAGPREV        0.071119
ANODIAG         0.067613
RADIO           0.062403
RECLOCAL        0.050995
PERDASEG        0.047730
HORMONIO        0.046865
delta_t5        0.030107
DIAGTRAT        0.027065
delta_t2        0.027065
Name: RECDIST, dtype: float64

Divisão em treino e teste

[ ]:
# RECDIST class balance in SP (very rare positive class).
df_SP.RECDIST.value_counts()
0    786768
1     19634
Name: RECDIST, dtype: int64
[ ]:
# Downsample the majority class (RECDIST == 0) to 400k rows, keep all positive
# rows, and rebuild the frame in original index order.
n_samples = 400000
df_SP_sem_rec = df_SP[df_SP.RECDIST == 0].sample(n_samples, random_state=seed).sort_index()
df_SP_rec = df_SP[df_SP.RECDIST == 1]
df_SP_menor = pd.concat([df_SP_rec, df_SP_sem_rec]).sort_index()
df_SP_menor.RECDIST.value_counts()
0    400000
1     19634
Name: RECDIST, dtype: int64
[ ]:
# RECDIST class balance outside SP.
df_fora.RECDIST.value_counts()
0    61211
1     1106
Name: RECDIST, dtype: int64
[ ]:
# Drop dates, outcome columns, and the other recurrence columns (RECNENHUM,
# RECLOCAL, RECREGIO, REC01-04) that would leak the RECDIST label.
list_drop = ['UFRESID', 'DTCONSULT', 'DTDIAG', 'DTTRAT', 'DTRECIDIVA', 'DTULTINFO',
              'IDADE', 'PERDASEG', 'CONSDIAG', 'TRATCONS', 'DIAGTRAT', 'RECNENHUM',
              'RECLOCAL', 'RECREGIO', 'REC01', 'REC02', 'REC03', 'REC04', 'delta_t4',
              'delta_t5', 'delta_t6', 'delta_t7', 'delta_t8', 'delta_t9', 'ob', 'ano_ob',
              'ob_com_rec', 'ob_sem_rec', 'vivo_com_rec', 'vivo_sem_rec', 'ULTINFO']

# Target column for this section.
lb = 'RECDIST'

SP

[ ]:
# Train/test split on the downsampled SP frame.
X_trainSP, X_testSP, y_trainSP, y_testSP = get_train_test(df_SP_menor, list_drop, lb)
X_train = (314725, 62), X_test = (104909, 62)
y_train = (314725,), y_test = (104909,)

Fora de SP

[ ]:
# Train/test split for outside SP.
X_trainFora, X_testFora, y_trainFora, y_testFora = get_train_test(df_fora, list_drop, lb)
X_train = (46737, 62), X_test = (15580, 62)
y_train = (46737,), y_test = (15580,)

Encoder e normalização

SP

[ ]:
# Fit encoders + StandardScaler on SP training data.
X_trainSP_enc, enc_SP, norm_SP = train_preprocessing(X_trainSP, normalizer='StandardScaler')

Fora de SP

[ ]:
# Fit encoders + StandardScaler on outside-SP training data.
X_trainFora_enc, enc_fora, norm_fora = train_preprocessing(X_trainFora, normalizer='StandardScaler')

PCA

SP

[ ]:
# PCA on the encoded SP training features, plus a scree plot.
pca = PCA()
principalComponents = pca.fit_transform(X_trainSP_enc)

fig = go.Figure()

# BUG FIX: x previously spanned X_trainSP_enc.shape[0] (row count) while y has
# one value per principal component; use the component count instead.
n_components = len(pca.explained_variance_ratio_)
components = np.arange(1, n_components + 1)

fig.add_trace(
    go.Scatter(
        x=components,
        y=np.cumsum(pca.explained_variance_ratio_),
        line_shape='hv',
    ))

fig.add_trace(
    go.Bar(
        x=components,
        y=pca.explained_variance_ratio_
    ))
fig.update_layout(yaxis_title='Variância que pode ser "explicada"', xaxis_title='Número de componentes principais')

fig.show()

Fora de SP

[ ]:
# PCA on the encoded outside-SP training features, plus a scree plot.
pca = PCA()
principalComponents = pca.fit_transform(X_trainFora_enc)

fig = go.Figure()

# BUG FIX: x previously spanned X_trainFora_enc.shape[0] (row count) while y
# has one value per principal component; use the component count instead.
n_components = len(pca.explained_variance_ratio_)
components = np.arange(1, n_components + 1)

fig.add_trace(
    go.Scatter(
        x=components,
        y=np.cumsum(pca.explained_variance_ratio_),
        line_shape='hv',
    ))

fig.add_trace(
    go.Bar(
        x=components,
        y=pca.explained_variance_ratio_
    ))
fig.update_layout(yaxis_title='Variância que pode ser "explicada"', xaxis_title='Número de componentes principais')

fig.show()

Balanceamento dos dados

SP

[ ]:
# Count of each label value in the training data.
y_trainSP.value_counts()
0    300054
1     14671
Name: RECDIST, dtype: int64
[ ]:
# Oversample the rare positive class with SMOTE to a 1:1 ratio.
X_SP, y_SP = SMOTE(random_state=seed).fit_resample(X_trainSP_enc, y_trainSP)
[ ]:
# Resampled training-set size after SMOTE.
y_SP.shape
(600108,)

Fora de SP

[ ]:
# Count of each label value in the training data.
y_trainFora.value_counts()
0    45898
1      839
Name: RECDIST, dtype: int64
[ ]:
# SMOTE-oversample the outside-SP training set to a 1:1 ratio.
X_fora, y_fora = SMOTE(random_state=seed).fit_resample(X_trainFora_enc, y_trainFora)
[ ]:
# Resampled training-set size after SMOTE.
y_fora.shape
(91796,)

Treinamento dos modelos de Machine Learning

Random Forest

[ ]:
# SP: Random Forest weighted heavily toward the positive class 1 (distant
# recurrence). NOTE(review): SMOTE already balanced the data 1:1; the 3e-05 vs
# 20 ratio trades precision for recall on class 1 — verify against the
# confusion matrices below.
rf_sp = RandomForestClassifier(random_state=seed, class_weight={0: 0.00003, 1: 20})
rf_sp.fit(X_SP, y_SP)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0: 3e-05, 1: 20}, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=10, verbose=0, warm_start=False)
[ ]:
# Fora: same approach with an even stronger class-1 weighting (1e-05 vs 24).
# NOTE(review): applied on top of an already SMOTE-balanced set — confirm intent.
rf_fora = RandomForestClassifier(random_state=seed, class_weight={0: 0.00001, 1: 24})
rf_fora.fit(X_fora, y_fora)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0: 1e-05, 1: 24}, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=10, verbose=0, warm_start=False)

XGBoost

[ ]:
# SP: XGBoost; scale_pos_weight=10_000 massively up-weights the positive class.
# NOTE(review): extreme on SMOTE-balanced data — presumably to force recall of
# distant recurrences; verify against the confusion matrices below.
xgboost_sp = xgb.XGBClassifier(max_depth=15,
                               scale_pos_weight=10_000,
                               random_state=seed)

xgboost_sp.fit(X_SP, y_SP)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=10,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=10000, seed=None,
              silent=None, subsample=1, verbosity=1)
[ ]:
# Fora de SP: even more extreme positive-class weighting (300_000) — NOTE(review):
# confirm this is intentional and check calibration of the resulting predictions.
xgboost_fora = xgb.XGBClassifier(max_depth=15,
                                 scale_pos_weight=300_000,
                                 random_state=seed)

xgboost_fora.fit(X_fora, y_fora)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=10,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=300000, seed=None,
              silent=None, subsample=1, verbosity=1)

Validação dos modelos

Pré-processamento dos dados de teste

[ ]:
# SP: apply the fitted encoders/normalizer to the test set.
X_testSP_ = test_preprocessing(X_testSP, enc_SP, norm_SP)
[ ]:
# Fora de SP: apply the fitted encoders/normalizer to the test set.
X_testFora_ = test_preprocessing(X_testFora, enc_fora, norm_fora)

Random Forest

[ ]:
# Row-normalized confusion matrix for the SP Random Forest.
plot_confusion_matrix(rf_sp, X_testSP_, y_testSP, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
_images/Cancer_Classifiers_171_0.png
[ ]:
# Feature importances; note the raw X_testSP is passed, not the encoded X_testSP_.
plot_feat_importances(rf_sp, X_testSP)
_images/Cancer_Classifiers_172_0.png
[ ]:
# Row-normalized confusion matrix for the outside-SP Random Forest.
plot_confusion_matrix(rf_fora, X_testFora_, y_testFora, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
_images/Cancer_Classifiers_173_0.png
[ ]:
# Feature importances for the outside-SP Random Forest.
plot_feat_importances(rf_fora, X_testFora)
_images/Cancer_Classifiers_174_0.png

XGBoost

[ ]:
# Row-normalized confusion matrix for the SP XGBoost model.
plot_confusion_matrix(xgboost_sp, X_testSP_, y_testSP, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
_images/Cancer_Classifiers_176_0.png
[ ]:
# Feature importances for the SP XGBoost model.
plot_feat_importances(xgboost_sp, X_testSP)
_images/Cancer_Classifiers_177_0.png
[ ]:
# Row-normalized confusion matrix for the outside-SP XGBoost model.
plot_confusion_matrix(xgboost_fora, X_testFora_, y_testFora, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
_images/Cancer_Classifiers_178_0.png
[ ]:
# Feature importances for the outside-SP XGBoost model.
plot_feat_importances(xgboost_fora, X_testFora)
_images/Cancer_Classifiers_179_0.png